import pandas as pd
import numpy as np
# Read Excel File
from itertools import islice
import csv
# Visulisation
import missingno as msno
import seaborn as sns
import matplotlib.pyplot as plt
from tabulate import tabulate
import plotly.express as px
from matplotlib import cm
from matplotlib import patches
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource
from bokeh.palettes import Category20
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.io import output_notebook
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from scipy.spatial import ConvexHull
import dexplot as dxp
# Math Helper
import re
import random
import math
# Load the county-level cancer-rate CSV.  The first 11 lines of the file are a
# free-text preamble and are skipped; the next line holds the column names and
# everything after it is data.
# (BUG FIX: the original also did ``csv.reader('<path>')`` on the path STRING,
# which iterates the characters of the filename, not the file -- and the
# result was never used.  That dead line is removed.)
value_by_row = []
with open('downloads/USA_CancerRates_All_ByCounty.csv', encoding='windows-1252') as f:
    r = csv.reader(f)
    for row in islice(r, 11, None):
        value_by_row.append(row)
# First retained row = header, the rest = data rows.
cols_name = value_by_row[0]
values_to_fill = value_by_row[1:]
Cancer_rate = pd.DataFrame(values_to_fill, columns=cols_name)
Cancer_rate.head()
Cancer_rate.info()
# Count the Null Value in each column and plot it next to a missingno matrix.
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(20,10))
# Per-column count of missing values.
na_counts = Cancer_rate.isna().sum()
color_palette = sns.color_palette()[5]
# NOTE(review): positional x/y arguments to sns.barplot only work on
# seaborn < 0.12; newer versions require keyword arguments (x=..., y=...).
null_values = sns.barplot(na_counts.index.values, na_counts, color = color_palette,ax=ax1)\
.set_title('Null Values Counts')
# Rotate the x tick labels so the long column names stay readable.
fig.autofmt_xdate()
# Missing-value matrix for the same dataframe, in the second panel.
msno.matrix(Cancer_rate, figsize=(16, 16), width_ratios=(15, 1),ax=ax2)
# So we find there is no NA in any column.
# However, the first column packs several pieces of geographic information
# together, therefore we can try to separate this column next.
# Split the combined geography column into State / County / SEER / NPCR parts.
State = []
Counties = []
SEER = []
NPCR = []
# Row 0 is formatted differently from the other rows, so it is handled on its
# own: no county, and the SEER/NPCR flags sit at positions 3 and 4 of the
# '(),'-split.
State.append(Cancer_rate.loc[0,'State'].split('(')[0])
Counties.append('Null')
SEER.append(re.split(r'[(),]',Cancer_rate.loc[0,'State'])[3])
NPCR.append(re.split(r'[(),]',Cancer_rate.loc[0,'State'])[4])
# Find places that do not follow the usual "County, State(...)" pattern.
special_place = []
for i in range(0,len(Cancer_rate)):
    # NOTE(review): the pattern is not a raw string; '\s' happens to survive
    # as a regex escape here, but r'...' would be safer.
    regex = re.findall('(.*)?\s?(.*),\s(.*)',Cancer_rate['State'][i])
    if not regex:
        special_place.append(Cancer_rate['State'].loc[i])
print('These places have different pattern from other places, and they are %s' %special_place)
# They are District of Columbia and Puerto Rico.
Cancer_rate[Cancer_rate['State'].str.contains('District of Columbia')|\
Cancer_rate['State'].str.contains('Puerto Rico')]
# Their indexes are 1947 and 2710.
# NOTE(review): those positions are hard-coded below -- fragile if the input
# file ever changes.
# Separate every other row: re.split on '(),' yields
# [county, state, SEER-flag, NPCR-flag, ...].
for i in [z for z in range(1,len(Cancer_rate)) if z not in [0,1947,2710]]:
    State.append(re.split(r'[(),]',Cancer_rate.loc[i,'State'])[1])
    Counties.append(re.split(r'[(),]',Cancer_rate.loc[i,'State'])[0])
    SEER.append(re.split(r'[(),]',Cancer_rate.loc[i,'State'])[2])
    NPCR.append(re.split(r'[(),]',Cancer_rate.loc[i,'State'])[3])
# Insert the two special places at their original positions so the lists stay
# aligned with the dataframe's row order.
for j in ([1947,2710]):
    State.insert(j,Cancer_rate.loc[j,'State'].split('(')[0])
    Counties.insert(j,'Null')
    SEER.insert(j,re.split(r'[(),]',Cancer_rate.loc[j,'State'])[1])
    NPCR.insert(j,re.split(r'[(),]',Cancer_rate.loc[j,'State'])[2])
# Write the separated pieces back as proper columns.
Cancer_rate['State'] = State
Cancer_rate['Counties'] = Counties
Cancer_rate['SEER'] = SEER
Cancer_rate['NPCR'] = NPCR
# Trim whitespace left over from the '(' split.
Cancer_rate['State'] = Cancer_rate['State'].apply(lambda x: x.strip())
Cancer_rate.head()
# Double check the cleaned columns:
# 1. categorical columns should not contain purely numeric values
# 2. numerical columns should only contain numbers
# Check 1: count, per categorical column, how many entries are NOT numeric
# text -- every entry should fail str.isnumeric().
cat_list = ['State','Recent Trend','Counties']
cat_count = {
    '{0}_count'.format(col): sum(1 for entry in Cancer_rate[col] if not entry.isnumeric())
    for col in cat_list
}
print('The number of character in each columns: \n')
print(tabulate(pd.DataFrame(cat_count.items(), columns=['variable_name', 'count'])))
# So all categorical variables do not contain any purely numeric value.
# Check 2: every remaining (numeric) column should contain only numbers.
# num_df records, per column: whether all entries parse, the offending row
# positions, and how many there are.
num_list = [i for i in Cancer_rate.columns if i not in cat_list]
num_df = pd.DataFrame([],columns=['all_number','index','count'],index=num_list)
for value in num_list:
    # True when every entry (after stripping a leading '-') parses as a number.
    num_df.loc[value,'all_number']= pd.to_numeric(Cancer_rate[value].str.lstrip('-'), errors='coerce').notnull().all()
    if num_df.loc[value,'all_number'] == True:
        num_df.loc[value,'index'] = 0
    if num_df.loc[value,'all_number'] == False:
        # Collect the row positions of the non-numeric entries.
        index = []
        # NOTE(review): 2932 is a hard-coded row count -- presumably
        # len(Cancer_rate); confirm it still matches the input file.
        for i in range(0,2932):
            if Cancer_rate.loc[i,value].lstrip('-').replace('.','',1).isdigit() == False:
                index.append(i)
        num_df.loc[value,'index'] = index
        num_df.loc[value,'count'] = len(index)
num_df = num_df.fillna({'count':0})
print("\nNumber of not numerical values in each columns: \n")
# NOTE(review): display() is an IPython/notebook builtin; use print() when
# running this as a plain script.
display(num_df)
# Spot-check a few rows: except the Age-Adjusted column, the other four
# numeric columns tend to be missing in the same row, so there should be some
# relationship between Recent 5-year Trend and NPCR.
for sample_idx in (1407, 324):
    print(Cancer_rate.loc[sample_idx])
    print('------------')
print(Cancer_rate.loc[543])
# Some column names are too long; shorten them before data cleaning.
column_names = Cancer_rate.columns
short_names = {
    column_names[2]: 'Cases_Per100000',
    column_names[3]: 'Cases_Per100000_LCI',
    column_names[4]: 'Cases_Per100000_UCI',
    column_names[7]: 'Trend_in_5_year',
    column_names[8]: 'Trend_LCI',
    column_names[9]: 'Trend_UCI',
}
Cancer_rate.rename(columns=short_names, inplace=True)
Cancer_rate.head()
# Clean Cases_Per100000 first (original name: Age-AdjustedCANCERcases_per100000).
# NOTE(review): this "pattern" is a single character CLASS, so re.match only
# tests the FIRST character of the string ('\b' inside [] is a backspace, not
# a word boundary).  It is only enough to confirm the flagged entries start
# with a digit/dot/space, which is what the equality check below relies on.
pattern = '[\d+.\d\s\b\s]'
age_count = 0
# num_df.iloc[1,1] holds the row positions of the non-numeric entries in the
# second numeric column; num_df.iloc[1,2] holds their count.
for i in num_df.iloc[1,1]:
    if bool(re.match(pattern, Cancer_rate.loc[i,'Cases_Per100000'])) == True:
        age_count += 1
print(age_count == num_df.iloc[1,2])
# All flagged values follow the same pattern, so fix them in one pass:
# keep only the leading "<number>" token and drop any trailing annotation.
for j in num_df.iloc[1,1]:
    Cancer_rate.loc[j,'Cases_Per100000'] = Cancer_rate.loc[j,'Cases_Per100000'].split(' ')[0]
# Inspect the rows flagged in the sixth numeric column before deciding what
# to do with them.
for k in num_df.iloc[5,1]:
    print(Cancer_rate.loc[k])
# Director Ami does not allow other data sources, so rows marked '*' are
# dropped instead of filled.  '*' means: data has been suppressed to ensure
# confidentiality and stability of rate and trend estimates.  Although these
# rows are not filled, the report from Georgia is available here:
# https://statecancerprofiles.cancer.gov/ratetrendbycancer/index.php?cancer=001&sex=1&stateFIPS=13&comparison=00&type=rtcancer
Cancer_rate = Cancer_rate[Cancer_rate['Recent Trend'] != '*'].reset_index(drop=True)
Cancer_rate.info()
# Convert column dtypes: the three text columns become 'category', every
# other column becomes 'float'.
categorical_column = ['State','Recent Trend','Counties']
for col in Cancer_rate.columns:
    target_dtype = 'category' if col in categorical_column else 'float'
    Cancer_rate[col] = Cancer_rate[col].astype(target_dtype)
print(Cancer_rate.dtypes)
# Mean cancer cases per 100000 people for each state, with a reference line
# at the national mean.
cases100000_mean = pd.DataFrame(Cancer_rate.groupby('State')['Cases_Per100000'].mean()).reset_index(level=0)
# Mean of the per-state means (an unweighted national average).
mean_cases_per100000 = cases100000_mean['Cases_Per100000'].mean()
fig = plt.figure(figsize=(20,20))
sns.set(font_scale=1.3)
sns.set_style("whitegrid")
p = sns.barplot(y='State',x='Cases_Per100000',data=cases100000_mean,palette="Set3")
# Vertical reference line at the overall mean.
p.axvline(mean_cases_per100000, ls='-',color='cyan',lw=5)
bbox_props = dict(boxstyle="larrow,pad=0.5", fc="cyan", ec="b", lw=3)
# Annotation at data coordinates (550, 24) -- hand-tuned for this figure size.
t = p.text(550, 24, "Line represented the mean value in the US", ha="center", va="center",
           size=15,
           bbox=bbox_props)
plt.title('Mean Cancer Cases Per 100000 people in each state')
fig.autofmt_xdate()
# About half of the states cross the mean line.
# Bokeh bar chart: number of county rows per state, sorted descending.
count = Cancer_rate.groupby('State').count().sort_values(by='Cases_Per100000',ascending=False).iloc[:,1]
output_notebook()
source = ColumnDataSource(data=dict(states=count.index.tolist(), counts=count.values))
# Category20 is a dict keyed by palette size; concatenating two of its value
# lists gives enough distinct colors to cover every state.
palette = list(Category20.values())[16] + list(Category20.values())[15]
# NOTE(review): plot_height was renamed to height in Bokeh 3.x.
p = figure(x_range=count.index.tolist(), plot_height=300, title="Counts of each states")
p.vbar(x='states', top='counts', width=0.9, source=source,
       line_color='white', fill_color=factor_cmap('states', palette=palette, factors=count.index.tolist()))
# Rotate state labels 90 degrees so they do not overlap.
p.xaxis.major_label_orientation = math.pi/2
show(p)
# In order to see the names clearly in the plot, abbreviate the state names.
plt.figure(figsize=(40,40))
copy_cancer = Cancer_rate.copy()
# Full state name -> USPS abbreviation lookup.
abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}
copy_cancer['State'] = copy_cancer['State'].map(abbrev)
# Select (up to) 30 DISTINCT states.
# BUG FIX: the original sampled 30 rows from the full State column, so the
# same state could be drawn repeatedly and fewer than 30 unique states were
# actually shown.  Sampling from the unique states matches the stated intent.
# (A dead ``copy_cancer.groupby('State')`` statement was also removed.)
unique_states = copy_cancer['State'].dropna().unique().tolist()
sample_list = random.sample(unique_states, min(30, len(unique_states)))
b = dxp.aggplot(agg='State',data=copy_cancer[copy_cancer['State'].isin(sample_list)],hue='Recent Trend',normalize='State',orient='h',figsize=(40,40))
b.tick_params(labelsize=30)
b.legend(loc=4,prop={'size':50})
# NOTE(review): ``fig`` here still refers to an earlier matplotlib figure;
# this call does not affect the dexplot figure.
fig.autofmt_xdate()
# Two derived indicators, to be shown as styled tables:
# 1. how much the count of 'falling' exceeds the count of 'stable'
# 2. the count of 'rising', floored at zero
grouped = copy_cancer.groupby(['Recent Trend','State'])['Recent Trend'].count()
trend_count = grouped.unstack().transpose().rename(columns=str).reset_index()
trend_count['falling - stable'] = trend_count['falling'] - trend_count['stable']
trend_count['rising_or_not'] = [rising if rising > 0 else 0 for rising in trend_count['rising']]
def color_negative_red(value):
    """Return a CSS background-color style for one dataframe cell.

    Red for negative values, green for positive values, yellow for zero.
    NaN cells are left uncolored.

    BUG FIX: the original docstring promised "Does not color NaN values",
    but ``NaN < 0`` and ``NaN > 0`` are both False, so NaN cells fell
    through to the ``else`` branch and were painted yellow.  An explicit
    NaN guard now returns an empty style for missing values.
    """
    if pd.isna(value):
        return ''  # leave missing values uncolored, as documented
    if value < 0:
        color = 'red'
    elif value > 0:
        color = 'green'
    else:
        color = 'yellow'
    return 'background-color: %s' % color
trend_count.style.applymap(color_negative_red, subset=['falling - stable','rising_or_not'])
# For the EDA part, pick states at both extremes of mean cases per 100000,
# keeping only states with more than 40 county rows so the sample size stays
# reasonably large.
state_larger_than_40 = count[count>40].index.tolist()
subset_larger_than_40 = Cancer_rate.query("State in @state_larger_than_40").reset_index(drop=True)
# Top-3 and bottom-3 states by mean cases per 100000 people.
subset_larger_than_40.groupby('State')['Cases_Per100000'].mean().nlargest(3)
groupby_state = subset_larger_than_40.groupby('State')['Cases_Per100000'].mean().nsmallest(3)
groupby_state
# NOTE(review): the original comment said the EDA would cover "Texas,
# Georgia, Idaho and South Carolina", but the code below actually selects
# Illinois, California, Texas, New York, Colorado and Kentucky.
EDA_subset = Cancer_rate[Cancer_rate['State'].isin(['Illinois','California','Texas','New York','Colorado','Kentucky'])].reset_index(drop=True)
EDA_subset
# Scatter Cases_Per100000 against Trend_in_5_year, one color per state.
sns.set_style('white')
categories = np.unique(EDA_subset['State'])
# One distinct tab10 color per state.
colors = [plt.cm.tab10(i/float(len(categories)-1)) for i in range(len(categories))]
fig = plt.figure(figsize=(16,10))
for i, category in enumerate(categories):
    # BUG FIX: the original passed ``cmap=colors[i]``.  ``cmap`` expects a
    # colormap and is ignored when no ``c`` array is given, so matplotlib
    # fell back to the default color cycle.  ``color=`` applies the intended
    # per-state color deterministically.
    plt.scatter('Trend_in_5_year', 'Cases_Per100000',
                data=EDA_subset.loc[EDA_subset.State == category, :],
                color=colors[i], s=60,
                label=str(category), edgecolors='black', linewidths=0.5)
def encircle(x, y, ax=None, **kw):
    """Overlay the convex hull of the (x, y) points as a Polygon patch.

    Extra keyword arguments (ec, fc, alpha, ...) are forwarded to
    ``plt.Polygon``.  Draws on the current axes when *ax* is not given.
    """
    if not ax:
        ax = plt.gca()
    points = np.c_[x, y]
    hull_vertices = points[ConvexHull(points).vertices, :]
    ax.add_patch(plt.Polygon(hull_vertices, **kw))
# Encircle the three states with the larger mean cases per 100000, dropping
# the extreme trend outliers first so the hull stays tight.
encircle_data = EDA_subset.loc[EDA_subset.State.isin(['Illinois','Kentucky','New York']),:]
encircle_data = encircle_data.drop(encircle_data['Trend_in_5_year'].idxmax())
encircle_data = encircle_data.drop(encircle_data['Trend_in_5_year'].idxmin())
x = encircle_data.Trend_in_5_year
y = encircle_data.Cases_Per100000
# Filled gold hull plus a firebrick outline around the same points.
encircle(x,y, ec="k", fc="gold", alpha=0.1)
encircle(x,y, ec="firebrick", fc="none", linewidth=1.5)
plt.gca().set(xlabel='Trend in 5 years', ylabel='Cases Per 100000 people')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.title("Distribution of Cases Per 100000 and Trend in 5 years", fontsize=22)
plt.legend(fontsize=15)
plt.show()
# The data for the three states with the larger mean cases per 100000 fall
# inside the small polygon.  Next, draw distribution plots for these states.
# Density plot of Cases_Per100000, one KDE curve per selected state.
plt.figure(figsize=(20,16),dpi=40)
states = ['Illinois','New York','Kentucky','California','Texas','Colorado']
colors = ['darkorange','darkgoldenrod','olive','darkslateblue','rebeccapurple','palevioletred']
sns.set(font_scale=2)
for state_color, state_name in zip(colors, states):
    sns.kdeplot(EDA_subset.loc[EDA_subset['State'] == state_name,'Cases_Per100000'],
                shade=True, color=state_color, label=state_name, alpha=0.5)
# Thicken the legend lines so they stay visible at this dpi.
legend = plt.legend()
for legend_line in legend.get_lines():
    legend_line.set_linewidth(10)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.title('Density Plot for Cases Per 100000',fontsize=20)
# Each state's distribution looks roughly normal, and the states can be
# split visually along a vertical line.
# draw the scatter with lower bound and upper bound for Cancer Cases
def plotly_errorbars(state):
    """Build an error-bar scatter trace of cancer cases for one state.

    Samples 20 county rows of *state* from the global ``EDA_subset``, derives
    the asymmetric error-bar offsets from the confidence-interval columns,
    and returns ``(trace, dataset)``: the plotly scatter trace and the
    sampled frame (with the new offset columns attached).

    NOTE(review): the sample is not seeded, so reruns pick different counties.
    """
    dataset = EDA_subset[EDA_subset['State'] == state].sample(n=20)
    # Offsets above (UCI) and below (LCI) the point estimate, as plotly expects.
    dataset['Cases_UCI'] = dataset['Cases_Per100000_UCI'] - dataset['Cases_Per100000']
    dataset['Cases_LCI'] = dataset['Cases_Per100000'] - dataset['Cases_Per100000_LCI']
    dataset['Trend_UCI_diff'] = dataset['Trend_UCI'] - dataset['Trend_in_5_year']
    dataset['Trend_LCI_diff'] = dataset['Trend_in_5_year'] - dataset['Trend_LCI']
    # BUG FIX: ``error_y`` must carry the upward offset (Cases_UCI) and
    # ``error_y_minus`` the downward offset (Cases_LCI); the original had
    # the two swapped, drawing each bar upside down.
    fig1 = px.scatter(dataset, x="Counties", y="Cases_Per100000", color='Trend_in_5_year',
                      error_y="Cases_UCI", error_y_minus="Cases_LCI")
    fig1.data[0].update(mode='markers', marker=dict(size=10))
    trace = fig1['data'][0]
    return trace, dataset
# 2x2 grid: one error-bar panel per state, each with a connecting line.
fig = make_subplots(
    rows=2, cols=2, subplot_titles=("Kentucky", "New York", "Texas", "California")
)
# One error-bar trace plus its sampled dataframe per state.  The dataframes
# keep their state names because the next figure reuses them.
trace1, Kentucky = plotly_errorbars('Kentucky')
trace2, New_York = plotly_errorbars('New York')
trace3, Texas = plotly_errorbars('Texas')
trace4, California = plotly_errorbars('California')
fig.update_layout(
    margin=dict(l=30, r=30, t=30, b=60),
    paper_bgcolor="LightSteelBlue",
    width = 800,
    height = 1200
)
# Add each state's error-bar trace and its county line to its own panel.
panels = [
    (trace1, Kentucky, 1, 1),
    (trace2, New_York, 1, 2),
    (trace3, Texas, 2, 1),
    (trace4, California, 2, 2),
]
for trace, frame, row, col in panels:
    fig.add_trace(trace, row=row, col=col)
    fig.add_trace(go.Scatter(x=frame['Counties'], y=frame['Cases_Per100000']),
                  row=row, col=col)
fig.layout.update(showlegend=False)
fig.show()
# draw the bar chart with lower bound and upper bound for Recent Trend
fig,ax = plt.subplots(2,2,figsize=(50,50))
states = [New_York,California,Kentucky,Texas]
state_names = ['New_York','California','Kentucky','Texas']
# One bar panel per state; the first two panels use the BrBG colormap and the
# last two use PiYG (the original spelled this out as two identical loops).
for panel, dataset in enumerate(states):
    plt.subplot(2, 2, panel + 1)
    # Normalize the trend values so the bar colors span the full colormap.
    norm = plt.Normalize(dataset['Trend_in_5_year'].min(), dataset['Trend_in_5_year'].max())
    cmap = plt.get_cmap("BrBG" if panel < 2 else "PiYG")
    plt.bar(x="Counties", height="Trend_in_5_year",
            yerr=[dataset['Trend_LCI_diff'], dataset['Trend_UCI_diff']],
            data=dataset, color=cmap(norm(dataset['Trend_in_5_year'].values)))
    # Attach a colorbar keyed to the same normalization.
    sm = cm.ScalarMappable(cmap=cmap, norm=norm)
    sm.set_array([])
    cbar = plt.colorbar(sm)
    plt.xticks(rotation=90)
    plt.title(state_names[panel], fontsize=40)
Cancer_rate.to_csv('Cancer_rate_cleaned.csv',index=False)